{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "DataAnalysis"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tools import *\n",
    "%matplotlib inline"
   ]
  },
  {
   "source": [
    "# Chap 3 处理原始文本\n",
    "1.  如何访问文件内的文本？\n",
    "2.  如何将文档分割成单独的单词和标点符号，从而进行文本语料上的分析？\n",
    "3.  如何产生格式化的输出，并把结果保存在文件中？"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "## 3.3. 使用 Unicode 进行文本处理"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "### 3.3.1 什么是Unicode？ \n",
    "-   Unicode支持一百万种以上的字符。每个字符分配一个编号，称为编码点。\n",
    "-   文件中的文本都有特定的编码，读取时需要将其翻译成Unicode，这个过程叫做解码；反之，把Unicode写入文件或终端之前需要将其转换成特定的编码，这个过程叫做编码。\n",
    "\n",
    "图3-3：Unicode 的解码和编码"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "### 3.3.2 从文件中提取已经编码的文本"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Pruska Biblioteka Państwowa. Jej dawne zbiory znane pod nazwą\n\"Berlinka\" to skarb kultury i sztuki niemieckiej. Przewiezione przez\nNiemców pod koniec II wojny światowej na Dolny Śląsk, zostały\nodnalezione po 1945 r. na terytorium Polski. Trafiły do Biblioteki\nJagiellońskiej w Krakowie, obejmują ponad 500 tys. zabytkowych\narchiwaliów, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.\n"
     ]
    }
   ],
   "source": [
    "# TypeError: expected str, bytes or os.PathLike object, not ZipFilePathPointer\n",
    "# 报这个错误是因为文件不存在，并不是真的遇到错误的类型\n",
    "# 报出这样的错误，可能是因为文件名不存在后，open再次调用一个空的path就会报错，需要重新初始化path再行\n",
    "path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt')\n",
    "f = open(path, encoding='latin2')\n",
    "for line in f:\n",
    "    line = line.strip()\n",
    "    print(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "b'Pruska Biblioteka Pa\\\\u0144stwowa. Jej dawne zbiory znane pod nazw\\\\u0105'\nb'\"Berlinka\" to skarb kultury i sztuki niemieckiej. Przewiezione przez'\nb'Niemc\\\\xf3w pod koniec II wojny \\\\u015bwiatowej na Dolny \\\\u015al\\\\u0105sk, zosta\\\\u0142y'\nb'odnalezione po 1945 r. na terytorium Polski. Trafi\\\\u0142y do Biblioteki'\nb'Jagiello\\\\u0144skiej w Krakowie, obejmuj\\\\u0105 ponad 500 tys. zabytkowych'\nb'archiwali\\\\xf3w, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.'\n"
     ]
    }
   ],
   "source": [
    "# unicode_escape 是一个虚拟的编码，将所有非 ASCII 字符转换成 \\uXXXX 的形式\n",
    "# 编码点在 ASCII 码 0~127 的范围以外但是低于 256 的，使用\\xXX 的形式表示\n",
    "f = open(path, encoding='latin2')\n",
    "for line in f:\n",
    "    line = line.strip()\n",
    "    print(line.encode('unicode_escape'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "ord('a')=  97\na= a\na.encode('utf8')=  b'a'\n"
     ]
    }
   ],
   "source": [
    "a=u'\\u0061'\n",
    "print(\"ord('a')= \", ord('a'))\n",
    "print('a=', a)\n",
    "print(\"a.encode('utf8')= \",a.encode('utf8'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "ord('ń')=  324\nnacute=  ń\nnacute.encode('utf-8')=  b'\\xc5\\x84'\nnacute.encode('utf8')=  b'\\xc5\\x84'\n"
     ]
    }
   ],
   "source": [
    "nacute = '\\u0144'\n",
    "print(\"ord('ń')= \", ord('ń'))\n",
    "print('nacute= ', nacute)\n",
    "print(\"nacute.encode('utf-8')= \", nacute.encode('utf-8'))\n",
    "print(\"nacute.encode('utf8')= \", nacute.encode('utf8'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "line=  Niemców pod koniec II wojny światowej na Dolny Śląsk, zostały\n",
      "\n",
      "line.encode('unicode_escape')=  b'Niemc\\\\xf3w pod koniec II wojny \\\\u015bwiatowej na Dolny \\\\u015al\\\\u0105sk, zosta\\\\u0142y\\\\n'\n",
      "line.encode('utf8')=  b'Niemc\\xc3\\xb3w pod koniec II wojny \\xc5\\x9bwiatowej na Dolny \\xc5\\x9al\\xc4\\x85sk, zosta\\xc5\\x82y\\n'\n",
      "--------------- >unicodedata< ---------------\n",
      "b'\\xc3\\xb3' = U+00f3 = LATIN SMALL LETTER O WITH ACUTE is ó\n",
      "b'\\xc5\\x9b' = U+015b = LATIN SMALL LETTER S WITH ACUTE is ś\n",
      "b'\\xc5\\x9a' = U+015a = LATIN CAPITAL LETTER S WITH ACUTE is Ś\n",
      "b'\\xc4\\x85' = U+0105 = LATIN SMALL LETTER A WITH OGONEK is ą\n",
      "b'\\xc5\\x82' = U+0142 = LATIN SMALL LETTER L WITH STROKE is ł\n",
      "line.find('zostały')=  54\n"
     ]
    }
   ],
   "source": [
    "# unicodedata 模块用于检查 Unicode 字符的属性\n",
    "import unicodedata\n",
    "\n",
    "lines = open(path, encoding='latin2').readlines()\n",
    "line = lines[2]\n",
    "print(\"line= \", line)\n",
    "print(\"line.encode('unicode_escape')= \",line.encode('unicode_escape'))\n",
    "print(\"line.encode('utf8')= \",line.encode('utf8'))\n",
    "show_subtitle(\"unicodedata\")\n",
    "for c in line:\n",
    "    if ord(c) > 127:\n",
    "        print('{} = U+{:04x} = {} is {}'.format(c.encode('utf8'), ord(c), unicodedata.name(c), c))\n",
    "print(\"line.find('zosta\\u0142y')= \",line.find('zosta\\u0142y'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "line=  niemców pod koniec ii wojny światowej na dolny śląsk, zostały\n",
      "\n",
      "line.encode('unicode_escape')=  b'niemc\\\\xf3w pod koniec ii wojny \\\\u015bwiatowej na dolny \\\\u015bl\\\\u0105sk, zosta\\\\u0142y\\\\n'\n",
      "line.encode('utf8')=  b'niemc\\xc3\\xb3w pod koniec ii wojny \\xc5\\x9bwiatowej na dolny \\xc5\\x9bl\\xc4\\x85sk, zosta\\xc5\\x82y\\n'\n",
      "--------------- >unicodedata< ---------------\n",
      "b'\\xc3\\xb3' = U+00f3 = LATIN SMALL LETTER O WITH ACUTE is ó\n",
      "b'\\xc5\\x9b' = U+015b = LATIN SMALL LETTER S WITH ACUTE is ś\n",
      "b'\\xc5\\x9b' = U+015b = LATIN SMALL LETTER S WITH ACUTE is ś\n",
      "b'\\xc4\\x85' = U+0105 = LATIN SMALL LETTER A WITH OGONEK is ą\n",
      "b'\\xc5\\x82' = U+0142 = LATIN SMALL LETTER L WITH STROKE is ł\n"
     ]
    }
   ],
   "source": [
    "line = line.lower()\n",
    "print(\"line= \", line)\n",
    "print(\"line.encode('unicode_escape')= \",line.encode('unicode_escape'))\n",
    "print(\"line.encode('utf8')= \",line.encode('utf8'))\n",
    "show_subtitle(\"unicodedata\")\n",
    "for c in line:\n",
    "    if ord(c) > 127:\n",
    "        print('{} = U+{:04x} = {} is {}'.format(c.encode('utf8'), ord(c), unicodedata.name(c), c))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "m.group()=  światowej\n--------------- >word_tokenize()< ---------------\n"
     ]
    },
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "['niemców',\n",
       " 'pod',\n",
       " 'koniec',\n",
       " 'ii',\n",
       " 'wojny',\n",
       " 'światowej',\n",
       " 'na',\n",
       " 'dolny',\n",
       " 'śląsk',\n",
       " ',',\n",
       " 'zostały']"
      ]
     },
     "metadata": {},
     "execution_count": 8
    }
   ],
   "source": [
    "import re\n",
    "\n",
    "m = re.search('\\u015b\\w*', line)\n",
    "print(\"m.group()= \",m.group())\n",
    "show_subtitle(\"word_tokenize()\")\n",
    "nltk.word_tokenize(line)"
   ]
  },
  {
   "source": [
    "### 3.3.3 在 Python 中使用本地编码\n",
    "需要在源文件的第一行或第二行加上注释： `# -*- coding: utf-8 -*-`（Python 3 默认按 UTF-8 读取源文件，使用 UTF-8 时可以省略）"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "sent=  Przewiezione przez Niemcow pod knoiec II wojny swiatowej na Dolny Slask, \nzostaly odnalezione po 1945 r. na terytorium Polski.\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "\n",
    "sent = \"\"\"Przewiezione przez Niemcow pod knoiec II wojny swiatowej na Dolny Slask, \n",
    "zostaly odnalezione po 1945 r. na terytorium Polski.\"\"\"\n",
    "print(\"sent= \", sent)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "bytes=  b'Przewiezione przez Niemcow pod knoiec II wojny swiatowej na Dolny Slask, \\nzostaly odnalezione po 1945 r. na terytorium Polski.'\nbytes.lower()=  b'przewiezione przez niemcow pod knoiec ii wojny swiatowej na dolny slask, \\nzostaly odnalezione po 1945 r. na terytorium polski.'\nbytes.decode('utf8')=  Przewiezione przez Niemcow pod knoiec II wojny swiatowej na Dolny Slask, \nzostaly odnalezione po 1945 r. na terytorium Polski.\n"
     ]
    }
   ],
   "source": [
    "bytes = sent.encode('utf8')\n",
    "print(\"bytes= \", bytes)\n",
    "print(\"bytes.lower()= \", bytes.lower())\n",
    "print(\"bytes.decode('utf8')= \", bytes.decode('utf8'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "replaced=  Przewiezione przez Niemcow pod knoiec II wojny [sacute]wiatowej na Dolny [sacute]la[sacute]k, \nzo[sacute]taly odnalezione po 1945 r. na terytorium Pol[sacute]ki.\n"
     ]
    }
   ],
   "source": [
    "SACUTE = re.compile('s|S')\n",
    "replaced = re.sub(SACUTE, '[sacute]', sent)\n",
    "print(\"replaced= \", replaced)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "sent=  this is string example....wow!!!\nbytes=  b'this is string example....wow!!!'\nstr=  this is string example....wow!!!\n"
     ]
    }
   ],
   "source": [
    "# 字符 与 字符串 的转换\n",
    "sent = \"this is string example....wow!!!\"\n",
    "print(\"sent= \", sent)\n",
    "bytes = sent.encode('utf8')\n",
    "print(\"bytes= \", bytes)\n",
    "# bytes.encode('utf8')  # bytes：字节不能再次编码\n",
    "str = bytes.decode('utf8')\n",
    "print(\"str= \", str)\n",
    "# str.decode('utf8')  # str：字符串不能再次解码"
   ]
  }
 ]
}